Mini-Project: SVM & LR Classification

2017-2018 California Department of Education Mathmematics Achievement
Created by An Nguyen, Andy Ho, Jodi Pafford, Tori Wheelis
February 06, 2019
In [117]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly as ply
from sklearn.model_selection import ShuffleSplit as ss
from sklearn.linear_model import LogisticRegression as lr
from sklearn import metrics as mt
from sklearn.preprocessing import StandardScaler as sts
from sklearn.pipeline import Pipeline as pl
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from __future__ import print_function as pr

##working object will be read latter on
#rainfall_original = pd.read_csv('weatherAus.csv') 
In [118]:
#functions to find the average value using the bracketing values around the NaN, will only use values if they are 
#   from the same city.
#if NaN is the earliest timepoint for a given city the next timepoint with no NaN will be given instead of the mean
#if NaN is the latest timepoint for a given city the previous timepoint with no NaN will be given instead of the mean

def impute_by_city(cities,variables):
    for c in cities:
        temp = rainfall[rainfall.Location == c] #parse out observations from a single city
        #interate through all observations of the temp data file
        i = min(temp.index)
        while i <= max(temp.index):
            for v in variables:
                if pd.isna(temp[v]).all():
                    pass
                elif pd.isna(temp[v][i]):
                    temp[v][i] = find_mean(temp[v], i)
                    rainfall[v][i] = temp[v][i]
            i = i + 1       

def find_mean(templist, index):
    if index == min(templist.index): #if earliest timepoint take the next value that is not NaN
        return find_top(templist, index)
    elif index == max(templist.index): #if latest timepoint take the previous value that is not NaN
        return find_bottom(templist, index)
    else:
        bottom = find_bottom(templist, index) #find previous non-NaN value
        top = find_top(templist, index) #find next non-NaN value
    if pd.isna(top): #if there are no more non-NaN values return the previous non-NaN value
        return bottom
    else:
        mean = (top + bottom)/2
        return mean

#find previous non-NaN value
def find_bottom(templist, index):
    while pd.isna(templist[index-1]):
        index = index-1
    bottom = templist[index-1]
    return bottom

#find next non-NaN value
#if there are no more non-NaN values return the previous non-NaN value
def find_top(templist, index):
    while pd.isna(templist[index+1]):
        index = index+1
        if index == max(templist.index):
            top = np.nan
            return top
    top = templist[index+1]
    return top          
In [119]:
##can be skipped if rainfall.csv already generated!
#rainfall = rainfall_original.copy()
#rainfall.drop(["RISK_MM"], axis=1, inplace=True) #RISK_MM was used to extrapolate response variable.
#rainfall['Date'] =  pd.to_datetime(rainfall['Date'], format='%Y-%m-%d') #change 'Date' variable to datetime64
#rainfall.dropna(subset=["RainToday"], inplace=True) #drop any observation with no record of rainfall for the day,
                                                    #   cannot be imputed
#rainfall = rainfall.reset_index(drop=True) #reset the index after drops
#rainfall.info()
In [120]:
##can be skipped if rainfall.csv already generated!
##set the cardinal directions to degrees
#directions = {'N':0, 'NNE':22.5, 'NE':45, 'NE':45, 'ENE':67.5, 'E':90, 'ESE':112.5, 'SE':135, 'SSE':157.5, 'S':180,\
#              'SSW':202.5, 'SW':225, 'WSW':247.5, 'W':270, 'WNW':292.5, 'NW':315, 'NNW':337.5}
#cities = rainfall.Location.unique() #get name of all cities in the data frame
#c_variables = [] #variables with continuous values
#d_variables = [] #variables with discreet values
        
#rainfall = rainfall.replace(directions) #replace cardianl direction to their corresponding degrees

##change 'Yes' and 'No' to 1 and 0 respectively
#rainfall.RainToday = rainfall.RainToday=='Yes'
#rainfall.RainToday = rainfall.RainToday.astype(np.int)
#rainfall.RainTomorrow = rainfall.RainTomorrow=='Yes'
#rainfall.RainTomorrow = rainfall.RainTomorrow.astype(np.int)

#for l in list(rainfall):
#    if (rainfall[l].dtypes == 'float64'):
#        c_variables.append(l)
#    else:
#        d_variables.append(l)
In [121]:
##can be skipped if rainfall.csv already generated!
##very expensive, rainfall.csv can be uploaded from work directory
#impute_by_city(cities, c_variables) #impute values to NaN 
#rainfall.to_csv("rainfall.csv", sep=',', index=True) #save to csv for later use
In [122]:
#load pre-generated rainfall.csv file
rainfall = pd.read_csv('rainfall.csv', index_col=0) 
rainfall.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 140787 entries, 0 to 140786
Data columns (total 23 columns):
Date             140787 non-null object
Location         140787 non-null object
MinTemp          140787 non-null float64
MaxTemp          140787 non-null float64
Rainfall         140787 non-null float64
Evaporation      97184 non-null float64
Sunshine         89329 non-null float64
WindGustDir      134862 non-null float64
WindGustSpeed    134862 non-null float64
WindDir9am       140787 non-null float64
WindDir3pm       140787 non-null float64
WindSpeed9am     140787 non-null float64
WindSpeed3pm     140787 non-null float64
Humidity9am      140787 non-null float64
Humidity3pm      140787 non-null float64
Pressure9am      129190 non-null float64
Pressure3pm      129190 non-null float64
Cloud9am         107253 non-null float64
Cloud3pm         107253 non-null float64
Temp9am          140787 non-null float64
Temp3pm          140787 non-null float64
RainToday        140787 non-null int64
RainTomorrow     140787 non-null int64
dtypes: float64(19), int64(2), object(2)
memory usage: 25.8+ MB
In [123]:
rainfall = rainfall.drop(['Evaporation', 'Sunshine'], axis = 1) #dropped variable becuase there are too many NaN's
l = list(rainfall.Location.unique())
rainfall.dropna(subset = list(rainfall), inplace = True) #dropped city becuase variable was not recorded by said city
#check to see which city was droped
for i in l:
    if i not in rainfall.Location.unique():
        print(i)
rainfall = rainfall.drop(['Date', 'Location'], axis = 1) #not needed for prediction
BadgerysCreek
Newcastle
NorahHead
Penrith
Tuggeranong
MountGinini
Nhil
Dartmoor
GoldCoast
Adelaide
Albany
Witchcliffe
SalmonGums
Walpole

Training and Testing Split

In [124]:
# we want to predict the X and y data as follows:
if 'RainTomorrow' in rainfall:
    y = rainfall['RainTomorrow'].values # get the labels we want
    del rainfall['RainTomorrow'] # get rid of the class label
    x = rainfall.values # use everything else to predict!  
    
# split our data into training and testing splits
num_cv_iterations = 5
num_instances = len(y)
cv_object = ss(n_splits=num_cv_iterations, test_size  = 0.2)
                         
print(cv_object)
ShuffleSplit(n_splits=5, random_state=None, test_size=0.2, train_size=None)

Logistic Regression

In [125]:
# iterate over the coefficients
column_names = rainfall.columns
weights = []
weights_array = []

scl_obj = sts()
for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(x,y)):
    scl_obj.fit(x[train_indices]) # find scalings for each column that make this zero mean and unit std

    X_train_scaled = scl_obj.transform(x[train_indices]) # apply to training
    X_test_scaled = scl_obj.transform(x[test_indices]) # apply those means and std to the test set (without snooping at the test set values)

    # train the model just as before
    lr_clf = lr(penalty='l2', C=0.05) # get object, the 'C' value is less (can you guess why??)
    lr_clf.fit(X_train_scaled,y[train_indices])  # train object

    y_hat = lr_clf.predict(X_test_scaled) # get test set precitions
    
    acc = mt.accuracy_score(y[test_indices],y_hat)
    conf = mt.confusion_matrix(y[test_indices],y_hat)
    print("")
    print('accuracy:', acc )
    print(conf )

    # sort these attributes and spit them out
    #zip_vars = zip(lr_clf.coef_.T,column_names) # combine attributes
    #zip_vars = sorted(zip_vars)
    zip_vars = pd.Series(lr_clf.coef_[0].T, index=column_names)
    for name, coef in zip_vars.items():
        print(name, 'has weight of', coef) # now print them out
        weights.append(coef)
    weights_array.append(weights)
    weights = []
weights_array = np.array(weights_array)
accuracy: 0.8532024079739465
[[15092   781]
 [ 2194  2199]]
MinTemp has weight of 0.05803651692818271
MaxTemp has weight of 0.13156914423384047
Rainfall has weight of 0.08387828168701633
WindGustDir has weight of 0.030014349696069382
WindGustSpeed has weight of 0.7128706550699504
WindDir9am has weight of -0.10510024124641003
WindDir3pm has weight of 0.07711947307900285
WindSpeed9am has weight of -0.06793149007127725
WindSpeed3pm has weight of -0.2576198083741177
Humidity9am has weight of 0.10515518844411331
Humidity3pm has weight of 1.1264699023052565
Pressure9am has weight of 1.039997534123468
Pressure3pm has weight of -1.4582203645527392
Cloud9am has weight of 0.15021140840366332
Cloud3pm has weight of 0.37308529576263133
Temp9am has weight of 0.19503773447706244
Temp3pm has weight of -0.414527746685684
RainToday has weight of 0.1953884238072875

accuracy: 0.8569031876048554
[[15216   776]
 [ 2124  2150]]
MinTemp has weight of 0.0882172349881368
MaxTemp has weight of 0.09250179814305856
Rainfall has weight of 0.08176378341189892
WindGustDir has weight of 0.0338641125460064
WindGustSpeed has weight of 0.7050144143827994
WindDir9am has weight of -0.10686252712895199
WindDir3pm has weight of 0.08151642762357922
WindSpeed9am has weight of -0.0713530628207336
WindSpeed3pm has weight of -0.25018446534998506
Humidity9am has weight of 0.07582103794685141
Humidity3pm has weight of 1.1316702847901814
Pressure9am has weight of 1.0651600207595324
Pressure3pm has weight of -1.4781177875882427
Cloud9am has weight of 0.1559953541581746
Cloud3pm has weight of 0.3685242271177231
Temp9am has weight of 0.15287066077930944
Temp3pm has weight of -0.3745694004002604
RainToday has weight of 0.19740997392159015

accuracy: 0.8546827198263101
[[15171   790]
 [ 2155  2150]]
MinTemp has weight of 0.055410023541420314
MaxTemp has weight of 0.0934336388872103
Rainfall has weight of 0.07813698725757526
WindGustDir has weight of 0.033095541107531816
WindGustSpeed has weight of 0.7086388222545223
WindDir9am has weight of -0.11216091238829191
WindDir3pm has weight of 0.08275724270405116
WindSpeed9am has weight of -0.06147891673162218
WindSpeed3pm has weight of -0.25517881612831345
Humidity9am has weight of 0.08537801180429712
Humidity3pm has weight of 1.1337400185492899
Pressure9am has weight of 1.0753051289196591
Pressure3pm has weight of -1.484125695503563
Cloud9am has weight of 0.15785574885394948
Cloud3pm has weight of 0.3676839969330465
Temp9am has weight of 0.1902466832110732
Temp3pm has weight of -0.3726972217713857
RainToday has weight of 0.20599489864808013

accuracy: 0.8538438764433041
[[15171   764]
 [ 2198  2133]]
MinTemp has weight of 0.054847367335200625
MaxTemp has weight of 0.13467183500658073
Rainfall has weight of 0.09151589979085789
WindGustDir has weight of 0.021516182786393313
WindGustSpeed has weight of 0.7016193967910415
WindDir9am has weight of -0.11379978560843138
WindDir3pm has weight of 0.0972680517757566
WindSpeed9am has weight of -0.06561592177622473
WindSpeed3pm has weight of -0.25762792423955766
Humidity9am has weight of 0.07922922984781304
Humidity3pm has weight of 1.1192522684576078
Pressure9am has weight of 1.0516579771995362
Pressure3pm has weight of -1.470694409736601
Cloud9am has weight of 0.16502461106427394
Cloud3pm has weight of 0.3674042150257026
Temp9am has weight of 0.18953309528404547
Temp3pm has weight of -0.4214498937998311
RainToday has weight of 0.19868498160961418

accuracy: 0.8581367808151584
[[15232   789]
 [ 2086  2159]]
MinTemp has weight of 0.06664477028064052
MaxTemp has weight of 0.061492074425887916
Rainfall has weight of 0.07935234284116094
WindGustDir has weight of 0.0369962416517932
WindGustSpeed has weight of 0.7070157199155226
WindDir9am has weight of -0.10377743397162154
WindDir3pm has weight of 0.08186828474826524
WindSpeed9am has weight of -0.0652384705110684
WindSpeed3pm has weight of -0.25753477423181625
Humidity9am has weight of 0.09027335026651485
Humidity3pm has weight of 1.1348400880069378
Pressure9am has weight of 1.05515671470215
Pressure3pm has weight of -1.474783567699629
Cloud9am has weight of 0.16177590236435785
Cloud3pm has weight of 0.36863952150855755
Temp9am has weight of 0.1812385866918331
Temp3pm has weight of -0.3401021440599143
RainToday has weight of 0.19154529260042363
In [126]:
ply.offline.init_notebook_mode() # run at the start of every notebook

mean_weights = np.mean(weights_array,axis = 0)
std_weights = np.std(weights_array,axis = 0)
final_array = pd.DataFrame(data={'mean':mean_weights, 'std':std_weights}, index = column_names)
final_array = final_array.sort_values(by=['mean'])

error_y=dict(
            type='data',
            array=final_array['std'].values,
            visible=True
        )

graph1 = {'x': final_array.index,
          'y': final_array['mean'].values,
    'error_y':error_y,
       'type': 'bar'}

fig = dict()
fig['data'] = [graph1]
fig['layout'] = {'title': 'Logistic Regression Weights, with error bars'}

ply.offline.iplot(fig)

Support Vector Machines

In [127]:
weights = []
weights_array = []

# okay, so run through the cross validation loop and set the training and testing variable for one single iteration
for train_indices, test_indices in cv_object.split(x,y): 
    # I will create new variables here so that it is more obvious what 
    # the code is doing (you can compact this syntax and avoid duplicating memory,
    # but it makes this code less readable)
    X_train = x[train_indices]
    y_train = y[train_indices]
    
    X_test = x[test_indices]
    y_test = y[test_indices]
    
    X_train_scaled = scl_obj.transform(X_train) # apply to training
    X_test_scaled = scl_obj.transform(X_test) 
    
    #train the model just as before
    svm_clf = SVC(C=0.5, kernel='linear', degree=3, gamma='auto') # get object
    svm_clf.fit(X_train_scaled, y_train)  # train object

    y_hat = svm_clf.predict(X_test_scaled) # get test set precitions

    acc = mt.accuracy_score(y_test,y_hat)
    conf = mt.confusion_matrix(y_test,y_hat)
    print("")
    print('accuracy:', acc )
    print(conf)
    
    # sort these attributes and spit them out
    zip_vars = pd.Series(svm_clf.coef_[0],index=column_names) # combine attributes
    for name, coef in zip_vars.items():
        print(name, 'has weight of', coef) # now print them out
        weights.append(coef)
    weights_array.append(weights)
    weights = []
weights_array = np.array(weights_array)
accuracy: 0.852560939504589
[[15309   706]
 [ 2282  1969]]
MinTemp has weight of -0.04109260175403051
MaxTemp has weight of 0.28948419872233444
Rainfall has weight of 0.10351868718692003
WindGustDir has weight of 0.019921199263876588
WindGustSpeed has weight of 0.46595986178590465
WindDir9am has weight of -0.07694249096709882
WindDir3pm has weight of 0.07679764895939911
WindSpeed9am has weight of -0.016733871515214105
WindSpeed3pm has weight of -0.20864580431577906
Humidity9am has weight of -0.012621898256156783
Humidity3pm has weight of 0.8327080325057068
Pressure9am has weight of 0.7579048109382711
Pressure3pm has weight of -1.007606085531279
Cloud9am has weight of 0.07892486405739874
Cloud3pm has weight of 0.1745759923528567
Temp9am has weight of 0.02922994445189886
Temp3pm has weight of -0.30505726143883294
RainToday has weight of 0.13264741138095815

accuracy: 0.8487121286884437
[[15222   718]
 [ 2348  1978]]
MinTemp has weight of -0.03788781065782132
MaxTemp has weight of 0.26265351094434664
Rainfall has weight of 0.10176901551767514
WindGustDir has weight of 0.013092007019963603
WindGustSpeed has weight of 0.4743111545157035
WindDir9am has weight of -0.07254627420894622
WindDir3pm has weight of 0.07756958644948497
WindSpeed9am has weight of -0.025162655367751086
WindSpeed3pm has weight of -0.21159872741645813
Humidity9am has weight of -0.015392553318861246
Humidity3pm has weight of 0.8453511147440622
Pressure9am has weight of 0.7435645710311292
Pressure3pm has weight of -0.995600155890429
Cloud9am has weight of 0.07157225436503722
Cloud3pm has weight of 0.16513418077738606
Temp9am has weight of 0.020077896372043824
Temp3pm has weight of -0.27020801699779895
RainToday has weight of 0.13660401854076554

accuracy: 0.854238626270601
[[15343   688]
 [ 2266  1969]]
MinTemp has weight of -0.046435269010828506
MaxTemp has weight of 0.3293742185987867
Rainfall has weight of 0.09967342013317193
WindGustDir has weight of 0.017106944437671245
WindGustSpeed has weight of 0.48211599676290007
WindDir9am has weight of -0.07983771856686417
WindDir3pm has weight of 0.07005908881171763
WindSpeed9am has weight of -0.02290122705340991
WindSpeed3pm has weight of -0.2235645476831678
Humidity9am has weight of -0.009665835802934453
Humidity3pm has weight of 0.8308050728674061
Pressure9am has weight of 0.7638986723745802
Pressure3pm has weight of -1.0075785134212083
Cloud9am has weight of 0.07161290232079409
Cloud3pm has weight of 0.17155246112724853
Temp9am has weight of 0.024938451062865852
Temp3pm has weight of -0.3423023233051481
RainToday has weight of 0.13549721302229045

accuracy: 0.8534984703444192
[[15332   655]
 [ 2314  1965]]
MinTemp has weight of -0.05073184779701023
MaxTemp has weight of 0.30071462565126694
Rainfall has weight of 0.10286346904365473
WindGustDir has weight of 0.009515281881988358
WindGustSpeed has weight of 0.4692166574915291
WindDir9am has weight of -0.07561612206822588
WindDir3pm has weight of 0.07437025622343185
WindSpeed9am has weight of -0.024516614897777345
WindSpeed3pm has weight of -0.21465258719899794
Humidity9am has weight of -0.014659716297728664
Humidity3pm has weight of 0.8247772579593402
Pressure9am has weight of 0.7413524699340996
Pressure3pm has weight of -0.9948442475306365
Cloud9am has weight of 0.07900677356610686
Cloud3pm has weight of 0.17435999016583992
Temp9am has weight of 0.024392764480808182
Temp3pm has weight of -0.3070565066589097
RainToday has weight of 0.1350831748022756

accuracy: 0.8569031876048554
[[15308   681]
 [ 2219  2058]]
MinTemp has weight of -0.0236310470928629
MaxTemp has weight of 0.28156731589729134
Rainfall has weight of 0.09781924562378208
WindGustDir has weight of 0.013685044314570405
WindGustSpeed has weight of 0.47352620834783465
WindDir9am has weight of -0.07926766731122825
WindDir3pm has weight of 0.07288860482091764
WindSpeed9am has weight of -0.036381835827398845
WindSpeed3pm has weight of -0.2131419870974014
Humidity9am has weight of -0.004309385199803728
Humidity3pm has weight of 0.810846037103147
Pressure9am has weight of 0.7456258643371712
Pressure3pm has weight of -0.9919023623124303
Cloud9am has weight of 0.07145985953843592
Cloud3pm has weight of 0.17526443810879755
Temp9am has weight of 0.044526334470987194
Temp3pm has weight of -0.3333000763441305
RainToday has weight of 0.1370687293363062
In [128]:
# look at the support vectors
print(svm_clf.support_vectors_.shape)
print(svm_clf.support_.shape)
print(svm_clf.n_support_ )
(28324, 18)
(28324,)
[14165 14159]
In [129]:
ply.offline.init_notebook_mode() # run at the start of every notebook

mean_weights = np.mean(weights_array,axis = 0)
std_weights = np.std(weights_array,axis = 0)
final_array = pd.DataFrame(data={'mean':mean_weights, 'std':std_weights}, index = column_names)
final_array = final_array.sort_values(by=['mean'])

error_y=dict(
            type='data',
            array=final_array['std'].values,
            visible=True
        )

graph1 = {'x': final_array.index,
          'y': final_array['mean'].values,
    'error_y':error_y,
       'type': 'bar'}

fig = dict()
fig['data'] = [graph1]
fig['layout'] = {'title': 'Support Vector Machines Weights, with error bars'}

ply.offline.iplot(fig)
In [130]:
# Now let's do some different analysis with the SVM and look at the instances that were chosen as support vectors

# now lets look at the support for the vectors and see if we they are indicative of anything
# grabe the rows that were selected as support vectors (these are usually instances that are hard to classify)

# make a dataframe of the training data
df_tested_on = rainfall.iloc[train_indices] # saved from above, the indices chosen for training
# now get the support vectors from the trained model
df_support = df_tested_on.iloc[svm_clf.support_,:]

df_support['RainTomorrow'] = y[svm_clf.support_] # add back in the 'Survived' Column to the pandas dataframe
rainfall['RainTomorrow'] = y # also add it back in for the original data
df_support.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28324 entries, 10508 to 81536
Data columns (total 19 columns):
MinTemp          28324 non-null float64
MaxTemp          28324 non-null float64
Rainfall         28324 non-null float64
WindGustDir      28324 non-null float64
WindGustSpeed    28324 non-null float64
WindDir9am       28324 non-null float64
WindDir3pm       28324 non-null float64
WindSpeed9am     28324 non-null float64
WindSpeed3pm     28324 non-null float64
Humidity9am      28324 non-null float64
Humidity3pm      28324 non-null float64
Pressure9am      28324 non-null float64
Pressure3pm      28324 non-null float64
Cloud9am         28324 non-null float64
Cloud3pm         28324 non-null float64
Temp9am          28324 non-null float64
Temp3pm          28324 non-null float64
RainToday        28324 non-null int64
RainTomorrow     28324 non-null int64
dtypes: float64(17), int64(2)
memory usage: 4.3 MB
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [131]:
# now lets see the statistics of these attributes
from pandas.tools.plotting import boxplot

# group the original data and the support vectors
df_grouped_support = df_support.groupby(['RainTomorrow'])
df_grouped = rainfall.groupby(['RainTomorrow'])

# plot KDE of Different variables
vars_to_plot = column_names

for v in vars_to_plot:
    plt.figure(figsize=(10,4))
    # plot support vector stats
    plt.subplot(1,2,1)
    ax = df_grouped_support[v].plot.kde() 
    plt.legend(['no rain','rained'])
    plt.title(v+' (Instances chosen as Support Vectors)')
    
    # plot original distributions
    plt.subplot(1,2,2)
    ax = df_grouped[v].plot.kde() 
    plt.legend(['no rain','rained'])
    plt.title(v+' (Original)')